load packages and config file

osuRepo = 'http://ftp.osuosl.org/pub/cran/'

if(!require(tidyverse)){
  install.packages('tidyverse',repos=osuRepo)
}
if(!require(caret)){
  install.packages('caret',repos=osuRepo)
}
## Warning in as.POSIXlt.POSIXct(Sys.time()): unknown timezone 'default/
## America/Los_Angeles'
if(!require(caTools)){
  install.packages('caTools',repos=osuRepo)
}
if(!require(reshape2)){
  install.packages('reshape2',repos=osuRepo)
}
source('config.R')

load confound files

file_list = list.files(confoundDir, pattern = 'confounds.tsv', recursive = TRUE)

if (!file.exists("~/Documents/code/auto-motion-fmriprep/tag.csv")) {
  for (file in file_list){
    # if the merged dataset doesn't exist, create it
    if (!exists('dataset')){
      filePattern = paste(subPattern, wavePattern, taskPattern, runPattern, 'bold_confounds.tsv', sep = "_")
      dataset = read_tsv(paste0(confoundDir, file)) %>% 
        mutate(file = file) %>%
        extract(file, c('sub', 'wave', 'task', 'run'),
                file.path('sub-.*','ses-wave.*', 'func', filePattern)) %>%
        mutate(wave = as.integer(wave),
               run = as.integer(run),
               stdDVARS = as.numeric(ifelse(stdDVARS %in% "n/a", NA, stdDVARS)),
               `non-stdDVARS` = as.numeric(ifelse(`non-stdDVARS` %in% "n/a", NA, `non-stdDVARS`)),
               `vx-wisestdDVARS` = as.numeric(ifelse(`vx-wisestdDVARS` %in% "n/a", NA, `vx-wisestdDVARS`)),
               FramewiseDisplacement = as.numeric(ifelse(FramewiseDisplacement %in% "n/a", NA, FramewiseDisplacement)),
               volume = row_number()) %>%
        select(sub, wave, task, run, volume, everything())
    }
  
    # if the merged dataset does exist, append to it
    else {
      filePattern = paste(subPattern, wavePattern, taskPattern, runPattern, 'bold_confounds.tsv', sep = "_")
      tmp = read_tsv(paste0(confoundDir, file)) %>% 
        mutate(file = file) %>%
        extract(file, c('sub', 'wave', 'task', 'run'),
                file.path('sub-.*','ses-wave.*', 'func', filePattern)) %>%
        mutate(wave = as.integer(wave),
               run = as.integer(run),
               stdDVARS = as.numeric(ifelse(stdDVARS %in% "n/a", NA, stdDVARS)),
               `non-stdDVARS` = as.numeric(ifelse(`non-stdDVARS` %in% "n/a", NA, `non-stdDVARS`)),
               `vx-wisestdDVARS` = as.numeric(ifelse(`vx-wisestdDVARS` %in% "n/a", NA, `vx-wisestdDVARS`)),
               FramewiseDisplacement = as.numeric(ifelse(FramewiseDisplacement %in% "n/a", NA, FramewiseDisplacement)),
               volume = row_number()) %>%
        select(sub, wave, task, run, volume, everything())
      dataset = bind_rows(dataset, tmp)
      rm(tmp)
    }
  }
} else {
  dataset = read.csv("~/Documents/code/auto-motion-fmriprep/tag.csv", stringsAsFactors = FALSE)
}

load hand coded data and merge

For TAG, we don’t have hand coded data so we’re just going to make up whether the volume is coded as trash or not

trash = data.frame(trash = rbinom(nrow(dataset), 1, .01))

dataset = bind_cols(dataset, trash) %>%
  select(sub, wave, task, run, volume, trash, everything())

visualize confounds

data.plot = dataset %>%
  gather(feature, value, -c(sub, wave, task, run, volume, trash))

# ggplot(data.plot, aes(1, value)) +
#   geom_boxplot() + 
#   facet_wrap(~feature, scales = "free", ncol = 4)

ggplot(data.plot, aes(value)) + 
  geom_density(fill = "pink") +
  facet_wrap(~feature, scales = "free", ncol = 4)
## Warning: Removed 868411 rows containing non-finite values (stat_density).

visualize confounds on a subject level

features = c("CSF", "WhiteMatter", "GlobalSignal", "FramewiseDisplacement", "stdDVARS", "tCompCor00", "aCompCor00")
subs = unique(dataset$sub)[1:5]

data.sub = data.plot %>%
  filter(feature %in% features) %>%
  mutate(sort = ifelse(feature == "CSF", 1,
                ifelse(feature == "GlobalSignal", 2,
                ifelse(feature == "WhiteMatter", 3,
                ifelse(feature == "FramewiseDisplacement", 4,
                ifelse(feature == "stdDVARS", 5,
                ifelse(feature == "tCompCor00", 6,
                ifelse(feature == "aCompCor00", 7, NA)))))))) %>%
  filter(sub %in% subs)

# ggplot(data.sub, aes(x = volume, y = value)) +
#   geom_line(aes(color = feature), size = .25) +
#   facet_wrap(reorder(feature, sort) ~ task + run, ncol = 4, scales = "free") +
#   scale_color_manual(values = wesanderson::wes_palette("Zissou", 7, type = "continuous"))

nada = data.sub %>% 
  group_by(sub) %>%
    do({
      plot = ggplot(., aes(volume, value)) + 
        geom_line(aes(color = feature), size = .25, show.legend = FALSE) +
        facet_wrap(reorder(feature, sort) ~ task + run, ncol = 4, scales = "free") +
        scale_color_manual(values = wesanderson::wes_palette("Zissou", 7, type = "continuous"))
        labs(title = .$sub[[1]])
      print(plot)
      #ggsave(plot, file = paste0(plotDir,.$sub[[1]],'.pdf'), height = 10, width = 12)
      data.frame()
    })
## Warning: Removed 2 rows containing missing values (geom_path).

## Warning: Removed 2 rows containing missing values (geom_path).

## Warning: Removed 2 rows containing missing values (geom_path).

## Warning: Removed 2 rows containing missing values (geom_path).

## Warning: Removed 2 rows containing missing values (geom_path).

correlate confounds

# define color palette
palette = c("#0071CF", "#FFD700", "#E2261C")

# subset and matrix-ify
dataset.corr = dataset %>%
  select(-c(sub, wave, task, run, volume, trash, starts_with("NonSteady"))) %>%
  cor(., use = "pairwise.complete.obs") %>%
  melt(.)

# plot the data
ggplot(dataset.corr, aes(as.factor(Var2), as.factor(Var1), fill = value)) +
  geom_tile(color = "white")+
  scale_fill_gradient2(low = palette[1], high = palette[3], mid = "white", 
                       midpoint = 0, limit = c(-1,1), space = "Lab", 
   name="pearson\ncorrelation") +
  labs(x = "",
       y = "") +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 10, hjust = 1)) + 
  geom_text(aes(label = round(value,1)), size = 2.5)

questions

  • Are the values meaningful across participants and runs? Or should they be scaled within subject and run?
  • How are rp features defined? From first volume or volume-to-volume?
  • aCompCor = signal associated with physiology
  • DVARS
  • STD –> more interpretable across subjects

split the data

set.seed(101) 
sample = sample.split(dataset$trash, SplitRatio = .75)
training = subset(dataset, sample == TRUE)
testing = subset(dataset, sample == FALSE)

machine learning

svm

training

# train.svm = training %>%
#   select(-c(sub, wave, task, run, volume, starts_with("NonSteady"))) %>%
#   mutate(trash = ifelse(trash == 1, "yes","no"),
#          trash = as.factor(trash))
# 
# # specify control parameters
# fitControl = trainControl(method = "repeatedcv", number = 10, repeats = 3, classProbs = TRUE)
# 
# # run initial model and print
# set.seed(101)
# svmFit = train(trash ~ ., 
#                data = train.svm, 
#                method = "svmLinear", 
#                trControl = fitControl,
#                preProcess = c("center", "scale"),
#                tuneLength = 10,
#                metric = "ROC",
#                verbose = FALSE,
#                na.action = na.pass)
# svmFit$finalModel
# 
# # predict model
# train_pred = predict(svmFit, newdata = train.svm, type="prob") %>%
#   select(-no)
# 
# # plot cutoff v. accuracy
# predicted = prediction(train_pred, train.svm$artifact, label.ordering = NULL)
# perf = performance(predicted, measure = "acc")
# perf.df = data.frame(cut=perf@x.values[[1]],acc=perf@y.values[[1]])
# 
# ggplot(perf.df, aes(cut, acc)) +
#   geom_line()
# 
# # plot false v. true positive rate
# perf = performance(predicted, measure = "tpr", x.measure = "fpr")
# perf.df = data.frame(cut=perf@alpha.values[[1]],fpr=perf@x.values[[1]],tpr=perf@y.values[[1]])
# 
# ggplot(perf.df, aes(fpr, tpr)) +
#   geom_line()
# 
# # plot specificity v. sensitivity
# perf = performance(predicted, measure = "sens", x.measure = "spec")
# perf.df = data.frame(cut=perf@alpha.values[[1]],sens=perf@x.values[[1]],spec=perf@y.values[[1]])
# ggplot(perf.df, aes(spec, sens)) +
#   geom_line()
# 
# ggplot(perf.df, aes(x = cut)) +
#   geom_line(aes(y = sens, color = "sensitivity")) + 
#   geom_line(aes(y = spec, color = "specificity"))
# 
# perf@alpha.values[[1]][which.max(perf@x.values[[1]]+perf@y.values[[1]])] # cut
# max(perf@x.values[[1]]+perf@y.values[[1]]) # sensitivity + specificity
# 
# # cut and assess accuracy in training sample
# train_pred = predict(svmFit, newdata = train.svm, type="prob") %>%
#   select(-no)
# train_pred =  as.matrix(train_pred)
# train_pred[train_pred > .09] = "yes"
# train_pred[train_pred < .09] = "no"
# confusionMatrix(train_pred, train.svm$artifact)

testing

# test.svm = testing %>%
#   select(-c(sub, wave, task, run, volume, starts_with("NonSteady"))) %>%
#   mutate(trash = ifelse(trash == 1, "yes","no"),
#          trash = as.factor(trash))
# 
# # cut and assess accuracy in test sample
# test_pred = predict(svmFit, newdata = test.svm, type="prob") %>%
#   select(-no)
# test_pred =  as.matrix(test_pred)
# test_pred[test_pred > .09] = "yes"
# test_pred[test_pred < .09] = "no"
# confusionMatrix(test_pred, test.svm$artifact)